# Code
# ---- Packages ----------------------------------------------------------
# Each package is loaded exactly once (the rendered source had the entire
# library() block duplicated, with two statements fused onto one line).
library(tidyverse)   # readr/dplyr/tidyr pipelines used throughout
library(ggplot2)     # importance plots
library(car)         # car::Anova() type-II tests
library(caret)       # createDataPartition(), train(), confusionMatrix()
library(rFerns)
library(VSURF)       # random-forest variable selection
library(glmnet)
library(Boruta)
library(doParallel)  # parallel backend used by VSURF
library(Hmisc)
library(mlbench)
library(pROC)        # roc(), coords()

# ---- Data: AAQoL survey ------------------------------------------------
# Read the survey and convert every character column to a factor; the
# reference levels set below matter for the sum-contrast models later.
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
# Set reference levels used by the sum-contrast logistic models below.
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese"),
Religion=relevel(Religion,ref="None")) |>
# Dichotomize income: brackets below $60,000 -> "Below", at/above -> "Above".
mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
"$10,000 - $19,999" ~"Below",
"$20,000 - $29,999"~"Below",
"$30,000 - $39,999"~"Below",
"$40,000 - $49,999"~"Below",
"$50,000 - $59,999"~"Below",
"$60,000 - $69,999"~"Above",
"$70,000 and over"~"Above",
.default=Income)) |>
mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |>
# Likert-style items: fix the level order explicitly so plots/models keep
# the natural low-to-high ordering rather than alphabetical order.
mutate(across(`Familiarity with America`:`Familiarity with Ethnic Origin`,~factor(.x,levels=c("Very low","Low", "High", "Very high"))),
across(`Identify Ethnically`,~factor(.x,levels=c("Not at all","Not very close","Somewhat close","Very close"))),
across(`Belonging`,~factor(.x,levels=c("Not at all","Not very much","Somewhat","Very much"))),
`Primary Language` = as.factor(`Primary Language`))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
We will use the variables selected at the VSURF interpretation step in a logistic regression, then run an analysis of deviance to check whether adding Ethnicity improves model fit.
# Unmet-health-need modeling data: outcome plus candidate predictors,
# complete cases only.  make.names() produces syntactic column names so
# the variables can be used directly in model formulas.
rfdata <- qol |> select(`Unmet Health Need`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Unmet.Health.Need)| Characteristic | N = 1,9741 |
|---|---|
| Unmet.Health.Need | |
| 0 | 1,770 (90%) |
| Yes | 204 (10%) |
| 1 n (%) | |
# ---- 80/20 stratified train/test split ---------------------------------
training_prop <- 0.8
set.seed(14)  # reproducible partition
# createDataPartition() stratifies on the outcome, so both sets retain the
# ~10% "Yes" prevalence seen in the full data.
train_index <- createDataPartition(y = rfdata$Unmet.Health.Need,
                                   p = training_prop,
                                   list = FALSE)  # was `list=F`; T/F are reassignable
training_set <- rfdata[train_index, ]
test_set <- rfdata[-train_index, ]
training_set |> gtsummary::tbl_summary(include=Unmet.Health.Need)| Characteristic | N = 1,5801 |
|---|---|
| Unmet.Health.Need | |
| 0 | 1,416 (90%) |
| Yes | 164 (10%) |
| 1 n (%) | |
test_set |> gtsummary::tbl_summary(include=Unmet.Health.Need)| Characteristic | N = 3941 |
|---|---|
| Unmet.Health.Need | |
| 0 | 354 (90%) |
| Yes | 40 (10%) |
| 1 n (%) | |
# ---- VSURF variable selection (balanced class sampling) ----------------
# Minority-class ("Yes") count in the training data, used to draw balanced
# bootstrap samples inside each random forest.
n_minority <- training_set |> filter(Unmet.Health.Need == "Yes") |> nrow()
rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # sample n_minority from each class
  strata = training_set[, 1],           # stratify by the response variable
  importance = TRUE                     # ensure importance is calculated
)
# Left assignment replaces the original right-assign (`->`); TRUE/FALSE are
# spelled out instead of the reassignable T/F shorthands.
vsurf.mod <- VSURF(Unmet.Health.Need ~ .,
                   training_set,
                   na.action = "na.omit",
                   parallel = TRUE, verbose = FALSE,
                   rf.options = rf_options_for_vsurf)
Warning in VSURF.formula(Unmet.Health.Need ~ ., training_set, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 6.7 secs
VSURF selected:
18 variables at thresholding step (in 3.8 secs)
1 variables at interpretation step (in 2.5 secs)
1 variables at prediction step (in 0.5 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Duration.of.Residency"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Duration.of.Residency"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.106962
# Importance table from the VSURF thresholding step; `fill` flags the
# Ethnicity row so the plot below can highlight it in red.
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
# Rounded view of the importance scores (console table follows).
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Duration.of.Residency 0.01080 0.00041 black
2 English.Speaking 0.01053 0.00050 black
3 Age 0.00873 0.00065 black
4 Discrimination 0.00850 0.00045 black
5 Ethnicity 0.00641 0.00051 red
6 English.Difficulties 0.00615 0.00045 black
7 Dental.Insurance 0.00527 0.00055 black
8 Religion 0.00385 0.00041 black
9 Belonging 0.00261 0.00037 black
10 Familiarity.with.America 0.00194 0.00050 black
11 US.Born 0.00176 0.00023 black
12 Income_median 0.00155 0.00037 black
13 Full.Time.Employment 0.00143 0.00024 black
14 Identify.Ethnically 0.00133 0.00036 black
15 Health.Insurance 0.00124 0.00038 black
16 Primary.Language 0.00101 0.00030 black
17 Gender 0.00070 0.00025 black
18 Familiarity.with.Ethnic.Origin 0.00066 0.00037 black
# ---- Variable-importance bar chart (unmet health need) -----------------
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill = fill)) +
  geom_bar(stat = "identity", alpha = 0.4) +
  geom_errorbar(aes(ymin = Importance - sd_Importance, ymax = Importance + sd_Importance)) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_manual(values = c("black", "red"),
                    guide = "none")
plot(importance_plot)
# Pass the plot explicitly rather than relying on ggsave()'s implicit
# "last plot displayed" default (the original fused these two calls).
ggsave(filename = "FinalFigures/VSURF_importance_unmethealth.png", plot = importance_plot,
       width = 6, height = 4.5, units = "in")
all_formula <- Unmet.Health.Need ~ .
# Variables chosen at the VSURF interpretation step, with Ethnicity forced
# in so the with/without-Ethnicity comparison below is always possible.
pred_vars <- names(rfdata[, -1])[vsurf.mod$varselect.interp]
if (!"Ethnicity" %in% pred_vars) {
  pred_vars <- c(pred_vars, "Ethnicity")
}
# Nested logistic models: full (with Ethnicity) vs reduced (without).
mod_form <- reformulate(pred_vars, response = "Unmet.Health.Need")
mod_form_noEth <- reformulate(setdiff(pred_vars, "Ethnicity"), response = "Unmet.Health.Need")
# Sum-to-zero contrasts: coefficients are deviations from the grand mean.
options(contrasts = c("contr.sum", "contr.poly"))
mod1 <- glm(mod_form, family = "binomial", data = training_set)
mod2 <- glm(mod_form_noEth, family = "binomial", data = training_set)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Unmet.Health.Need ~ Duration.of.Residency
Model 2: Unmet.Health.Need ~ Duration.of.Residency + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1578 1052.5
2 1573 1006.8 5 45.709 1.041e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1058.393 1067.276
car::Anova(mod1, test.statistic = "F")Analysis of Deviance Table (Type II tests)
Response: Unmet.Health.Need
Error estimate based on Pearson residuals
Sum Sq Df F value Pr(>F)
Duration.of.Residency 5.20 1 5.0129 0.0253 *
Ethnicity 45.71 5 8.8196 2.882e-08 ***
Residuals 1630.47 1573
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = training_set)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.131584 0.155408 -13.716 < 2e-16 ***
Duration.of.Residency -0.016066 0.007193 -2.234 0.02551 *
Ethnicity1 0.185655 0.181354 1.024 0.30597
Ethnicity2 -0.791520 0.248527 -3.185 0.00145 **
Ethnicity3 -0.376729 0.303650 -1.241 0.21473
Ethnicity4 0.545289 0.177129 3.078 0.00208 **
Ethnicity5 -0.465909 0.393166 -1.185 0.23601
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1053.4 on 1579 degrees of freedom
Residual deviance: 1006.8 on 1573 degrees of freedom
AIC: 1020.8
Number of Fisher Scoring iterations: 5
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.0996 0.0154 Inf 0.0732 0.134
Asian Indian 0.0400 0.0104 Inf 0.0240 0.066
Filipino 0.0593 0.0192 Inf 0.0311 0.110
Korean 0.1369 0.0194 Inf 0.1030 0.180
Other 0.0545 0.0237 Inf 0.0229 0.124
Vietnamese 0.1849 0.0228 Inf 0.1443 0.234
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff") contrast estimate SE df z.ratio p.value
Chinese effect 0.186 0.181 Inf 1.024 0.3060
Asian Indian effect -0.792 0.249 Inf -3.185 0.0042
Filipino effect -0.377 0.304 Inf -1.241 0.2832
Korean effect 0.545 0.177 Inf 3.078 0.0042
Other effect -0.466 0.393 Inf -1.185 0.2832
Vietnamese effect 0.903 0.170 Inf 5.301 <.0001
Results are given on the log odds ratio (not the response) scale.
P value adjustment: fdr method for 6 tests
# ---- Test-set evaluation of the Ethnicity model ------------------------
# Predicted probabilities of unmet health need on the held-out set.
predictions <- mod1 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Unmet.Health.Need,predictions)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
ROC
Call:
roc.default(response = test_set$Unmet.Health.Need, predictor = predictions)
Data: predictions in 354 controls (test_set$Unmet.Health.Need 0) < 40 cases (test_set$Unmet.Health.Need Yes).
Area under the curve: 0.586
# Youden-optimal probability cutoff from the ROC curve.
ROC |> plot()threshold <- ROC |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold[1] 0.1228186
# Hard-classify at the Youden threshold; factor levels match the outcome.
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Unmet.Health.Need,responsepredictions,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 226 128
Yes 18 22
Accuracy : 0.6294
95% CI : (0.5797, 0.6773)
No Information Rate : 0.6193
P-Value [Acc > NIR] : 0.3596
Kappa : 0.0849
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.14667
Specificity : 0.92623
Pos Pred Value : 0.55000
Neg Pred Value : 0.63842
Prevalence : 0.38071
Detection Rate : 0.05584
Detection Prevalence : 0.10152
Balanced Accuracy : 0.53645
'Positive' Class : Yes
predictions <- mod2 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Unmet.Health.Need,predictions)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
ROC
Call:
roc.default(response = test_set$Unmet.Health.Need, predictor = predictions)
Data: predictions in 354 controls (test_set$Unmet.Health.Need 0) < 40 cases (test_set$Unmet.Health.Need Yes).
Area under the curve: 0.5696
ROC |> plot()threshold <- ROC |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold[1] 0.09568894
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Unmet.Health.Need,responsepredictions,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 56 298
Yes 1 39
Accuracy : 0.2411
95% CI : (0.1997, 0.2865)
No Information Rate : 0.8553
P-Value [Acc > NIR] : 1
Kappa : 0.031
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.11573
Specificity : 0.98246
Pos Pred Value : 0.97500
Neg Pred Value : 0.15819
Prevalence : 0.85533
Detection Rate : 0.09898
Detection Prevalence : 0.10152
Balanced Accuracy : 0.54909
'Positive' Class : Yes
# Clear any parallel backend registered with foreach/doParallel so a later
# caret::train() call runs sequentially instead of reusing a stale cluster.
# NOTE(review): this reaches into foreach's internal environment via `:::`,
# which is fragile across foreach versions — confirm it is still needed.
unregister_dopar <- function() {
  env <- foreach:::.foreachGlobals
  rm(list=ls(name=env), pos=env)
}
# Reset the parallel backend once before CV training (the rendered source
# showed the call fused/duplicated on a single line).
unregister_dopar()
ctrl <- trainControl(method = "cv")  # 10-fold cross-validation
# Random forest over all predictors, Ethnicity included.
model <- train(Unmet.Health.Need ~ ., data = rfdata, method = "rf", trControl = ctrl)
summary(model) Length Class Mode
call 4 -none- call
type 1 -none- character
predicted 1974 factor numeric
err.rate 1500 -none- numeric
confusion 6 -none- numeric
votes 3948 matrix numeric
oob.times 1974 -none- numeric
classes 2 -none- character
importance 39 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 14 -none- list
y 1974 factor numeric
test 0 -none- NULL
inbag 0 -none- NULL
xNames 39 -none- character
problemType 1 -none- character
tuneValue 1 data.frame list
obsLevels 2 -none- character
param 0 -none- list
model |> print()Random Forest
1974 samples
18 predictor
2 classes: '0', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1776, 1776, 1777, 1776, 1777, 1777, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.8966621 0.00000000
20 0.8981823 0.11232402
39 0.8936189 0.08244707
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 20.
unregister_dopar()
ctrl <- trainControl(method="cv")
model <- train(Unmet.Health.Need~., data = rfdata|> dplyr::select(-Ethnicity), method = "rf", trControl = ctrl)
model |> print()Random Forest
1974 samples
17 predictor
2 classes: '0', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1776, 1777, 1777, 1777, 1776, 1777, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.8966621 0.00000000
18 0.8961596 0.06704321
34 0.8941291 0.07586865
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
rfdata <- qol |> select(`Unmet Dental Needs`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)| Characteristic | N = 1,9681 |
|---|---|
| Unmet.Dental.Needs | |
| 0 | 1,744 (89%) |
| Yes | 224 (11%) |
| 1 n (%) | |
# ---- 80/20 stratified split for the unmet-dental-needs outcome ---------
training_prop <- 0.8
set.seed(14)  # same seed as the health-need section for reproducibility
train_index <- createDataPartition(y = rfdata$Unmet.Dental.Needs,
                                   p = training_prop,
                                   list = FALSE)  # was `list=F`; T/F are reassignable
training_set <- rfdata[train_index, ]
test_set <- rfdata[-train_index, ]
training_set |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)| Characteristic | N = 1,5761 |
|---|---|
| Unmet.Dental.Needs | |
| 0 | 1,396 (89%) |
| Yes | 180 (11%) |
| 1 n (%) | |
test_set |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)| Characteristic | N = 3921 |
|---|---|
| Unmet.Dental.Needs | |
| 0 | 348 (89%) |
| Yes | 44 (11%) |
| 1 n (%) | |
# ---- VSURF variable selection, unmet dental needs ----------------------
# Minority-class ("Yes") count for balanced bootstrap sampling.
n_minority <- training_set |> filter(Unmet.Dental.Needs == "Yes") |> nrow()
rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # sample n_minority from each class
  strata = training_set[, 1],           # stratify by the response variable
  importance = TRUE                     # ensure importance is calculated
)
# Left assignment replaces `->`; TRUE/FALSE spelled out instead of T/F.
vsurf.mod <- VSURF(Unmet.Dental.Needs ~ .,
                   training_set,
                   na.action = "na.omit",
                   parallel = TRUE, verbose = FALSE,
                   rf.options = rf_options_for_vsurf)
Warning in VSURF.formula(Unmet.Dental.Needs ~ ., training_set, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 6.6 secs
VSURF selected:
17 variables at thresholding step (in 4 secs)
1 variables at interpretation step (in 2.4 secs)
1 variables at prediction step (in 0.2 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Dental.Insurance"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Dental.Insurance"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.1159898
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Dental.Insurance 0.01434 0.00073 black
2 English.Speaking 0.01012 0.00058 black
3 Duration.of.Residency 0.00987 0.00064 black
4 Age 0.00910 0.00067 black
5 Religion 0.00570 0.00057 black
6 Familiarity.with.America 0.00398 0.00038 black
7 Primary.Language 0.00304 0.00032 black
8 Ethnicity 0.00271 0.00043 red
9 Income_median 0.00227 0.00043 black
10 English.Difficulties 0.00217 0.00049 black
11 Familiarity.with.Ethnic.Origin 0.00172 0.00043 black
12 US.Born 0.00157 0.00018 black
13 Discrimination 0.00156 0.00043 black
14 Belonging 0.00135 0.00042 black
15 Identify.Ethnically 0.00129 0.00036 black
16 Full.Time.Employment 0.00104 0.00035 black
17 Health.Insurance 0.00095 0.00044 black
18 Gender -0.00015 0.00018 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_unmetdental.png", width=6, height=4.5,units="in")Dental Insurance is the only variable in the interpretation set, which means Ethnicity might not be a significant predictor of unmet dental needs.
# Interpretation-step variables plus Ethnicity.  Ethnicity is appended only
# when VSURF did not already select it — the original unconditional
# c(pred_vars, "Ethnicity") could duplicate the term — matching the guard
# used in the unmet-health section.
pred_vars <- names(rfdata[, -1])[vsurf.mod$varselect.interp]
if (!"Ethnicity" %in% pred_vars) {
  pred_vars <- c(pred_vars, "Ethnicity")
}
mod_form <- reformulate(pred_vars, response = "Unmet.Dental.Needs")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in% c("Ethnicity")], response = "Unmet.Dental.Needs")
# NOTE(review): these models are fit on the full rfdata, while the
# unmet-health section fit on training_set — confirm this is intentional
# before interpreting the test-set ROC below.
mod1 <- glm(mod_form, family = "binomial", data = rfdata)
mod2 <- glm(mod_form_noEth, family = "binomial", data = rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Unmet.Dental.Needs ~ Dental.Insurance
Model 2: Unmet.Dental.Needs ~ Dental.Insurance + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1966 1300.0
2 1961 1291.3 5 8.7216 0.1207
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1344.41 1315.208
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.11153 0.09115 -23.165 <2e-16 ***
Dental.Insurance1 0.69784 0.07851 8.889 <2e-16 ***
Ethnicity1 -0.12599 0.15545 -0.810 0.4177
Ethnicity2 -0.31150 0.17526 -1.777 0.0755 .
Ethnicity3 0.17410 0.21838 0.797 0.4253
Ethnicity4 0.23328 0.14812 1.575 0.1153
Ethnicity5 -0.17960 0.30374 -0.591 0.5543
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1395.0 on 1967 degrees of freedom
Residual deviance: 1291.3 on 1961 degrees of freedom
AIC: 1305.3
Number of Fisher Scoring iterations: 5
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.0964 0.0138 Inf 0.0726 0.127
Asian Indian 0.0814 0.0139 Inf 0.0580 0.113
Filipino 0.1259 0.0266 Inf 0.0823 0.188
Korean 0.1326 0.0170 Inf 0.1026 0.170
Other 0.0919 0.0297 Inf 0.0480 0.169
Vietnamese 0.1299 0.0176 Inf 0.0991 0.168
Results are averaged over the levels of: Dental.Insurance
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff") contrast estimate SE df z.ratio p.value
Chinese effect -0.126 0.155 Inf -0.810 0.5104
Asian Indian effect -0.311 0.175 Inf -1.777 0.3458
Filipino effect 0.174 0.218 Inf 0.797 0.5104
Korean effect 0.233 0.148 Inf 1.575 0.3458
Other effect -0.180 0.304 Inf -0.591 0.5543
Vietnamese effect 0.210 0.154 Inf 1.361 0.3469
Results are averaged over the levels of: Dental.Insurance
Results are given on the log odds ratio (not the response) scale.
P value adjustment: fdr method for 6 tests
The non-significant analysis of deviance (p = 0.12) suggests that Ethnicity is not a meaningful predictor of unmet dental needs once dental insurance coverage is accounted for.
predictions <- mod1 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Unmet.Dental.Needs,predictions)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
ROC
Call:
roc.default(response = test_set$Unmet.Dental.Needs, predictor = predictions)
Data: predictions in 348 controls (test_set$Unmet.Dental.Needs 0) < 44 cases (test_set$Unmet.Dental.Needs Yes).
Area under the curve: 0.6702
ROC |> plot()threshold <- ROC |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold[1] 0.1109492
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Unmet.Dental.Needs,responsepredictions,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 226 122
Yes 15 29
Accuracy : 0.6505
95% CI : (0.601, 0.6977)
No Information Rate : 0.6148
P-Value [Acc > NIR] : 0.07996
Kappa : 0.1496
Mcnemar's Test P-Value : < 2e-16
Sensitivity : 0.19205
Specificity : 0.93776
Pos Pred Value : 0.65909
Neg Pred Value : 0.64943
Prevalence : 0.38520
Detection Rate : 0.07398
Detection Prevalence : 0.11224
Balanced Accuracy : 0.56491
'Positive' Class : Yes
predictions <- mod2 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Unmet.Dental.Needs,predictions)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
ROC
Call:
roc.default(response = test_set$Unmet.Dental.Needs, predictor = predictions)
Data: predictions in 348 controls (test_set$Unmet.Dental.Needs 0) < 44 cases (test_set$Unmet.Dental.Needs Yes).
Area under the curve: 0.6543
ROC |> plot()threshold <- ROC |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold[1] 0.1291447
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Unmet.Dental.Needs,responsepredictions,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 226 122
Yes 15 29
Accuracy : 0.6505
95% CI : (0.601, 0.6977)
No Information Rate : 0.6148
P-Value [Acc > NIR] : 0.07996
Kappa : 0.1496
Mcnemar's Test P-Value : < 2e-16
Sensitivity : 0.19205
Specificity : 0.93776
Pos Pred Value : 0.65909
Neg Pred Value : 0.64943
Prevalence : 0.38520
Detection Rate : 0.07398
Detection Prevalence : 0.11224
Balanced Accuracy : 0.56491
'Positive' Class : Yes
unregister_dopar()
ctrl <- trainControl(method="cv")
model <- train(Unmet.Dental.Needs~., data = rfdata, method = "rf", trControl = ctrl)
summary(model) Length Class Mode
call 4 -none- call
type 1 -none- character
predicted 1968 factor numeric
err.rate 1500 -none- numeric
confusion 6 -none- numeric
votes 3936 matrix numeric
oob.times 1968 -none- numeric
classes 2 -none- character
importance 39 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 14 -none- list
y 1968 factor numeric
test 0 -none- NULL
inbag 0 -none- NULL
xNames 39 -none- character
problemType 1 -none- character
tuneValue 1 data.frame list
obsLevels 2 -none- character
param 0 -none- list
model |> print()Random Forest
1968 samples
18 predictor
2 classes: '0', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1771, 1772, 1772, 1770, 1771, 1772, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.8861825 0.00000000
20 0.8806012 0.02161838
39 0.8795808 0.03184641
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
unregister_dopar()
ctrl <- trainControl(method="cv")
model <- train(Unmet.Dental.Needs~., data = rfdata|> dplyr::select(-Ethnicity), method = "rf", trControl = ctrl)
model |> print()Random Forest
1968 samples
17 predictor
2 classes: '0', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1772, 1772, 1770, 1771, 1772, 1771, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.8861844 0.00000000
18 0.8815952 0.04198948
34 0.8790571 0.04239551
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
# Physical check-up modeling data; this section uses shorter renamed
# columns (Employment, EnglishSpeak, EnglishDiff).  The stray magrittr
# %>% has been replaced with the native |> used everywhere else.
rfdata <- qol |>
  select(`Physical Check-up`, Ethnicity, Age, Gender, Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`, `Health Insurance`, `Dental Insurance`) |>
  na.omit() |>
  rename(Employment = `Full Time Employment`,
         EnglishSpeak = `English Speaking`,
         EnglishDiff = `English Difficulties`) |>
  as.data.frame() |>
  rename_with(make.names)
rfdata |> gtsummary::tbl_summary(include = Physical.Check.up)
| Characteristic | N = 1,9661 |
|---|---|
| Physical.Check.up | |
| 0 | 630 (32%) |
| Yes | 1,336 (68%) |
| 1 n (%) | |
# ---- 80/20 stratified split for the physical check-up outcome ----------
training_prop <- 0.8
set.seed(14)  # same seed as the other sections for reproducibility
train_index <- createDataPartition(y = rfdata$Physical.Check.up,
                                   p = training_prop,
                                   list = FALSE)  # was `list=F`; T/F are reassignable
training_set <- rfdata[train_index, ]
test_set <- rfdata[-train_index, ]
training_set |> gtsummary::tbl_summary(include=Physical.Check.up)| Characteristic | N = 1,5731 |
|---|---|
| Physical.Check.up | |
| 0 | 504 (32%) |
| Yes | 1,069 (68%) |
| 1 n (%) | |
test_set |> gtsummary::tbl_summary(include=Physical.Check.up)| Characteristic | N = 3931 |
|---|---|
| Physical.Check.up | |
| 0 | 126 (32%) |
| Yes | 267 (68%) |
| 1 n (%) | |
# ---- VSURF variable selection, physical check-up -----------------------
# Here "0" (no check-up) is the minority class (504 of 1,573 in training).
n_minority <- training_set |> filter(Physical.Check.up == "0") |> nrow()
rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # sample n_minority from each class
  strata = training_set[, 1],           # stratify by the response variable
  importance = TRUE                     # ensure importance is calculated
)
# Left assignment replaces `->`; TRUE/FALSE spelled out instead of T/F.
vsurf.mod <- VSURF(Physical.Check.up ~ .,
                   training_set,
                   na.action = "na.omit",
                   parallel = TRUE, verbose = FALSE,
                   rf.options = rf_options_for_vsurf)
Warning in VSURF.formula(Physical.Check.up ~ ., training_set, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 8.6 secs
VSURF selected:
16 variables at thresholding step (in 4.7 secs)
3 variables at interpretation step (in 2.7 secs)
2 variables at prediction step (in 1.1 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Duration.of.Residency" "Health.Insurance"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Duration.of.Residency" "Age" "Health.Insurance"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.2933566
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Duration.of.Residency 0.03041 0.00086 black
2 Age 0.02406 0.00115 black
3 Health.Insurance 0.01988 0.00062 black
4 Dental.Insurance 0.01257 0.00075 black
5 Ethnicity 0.00767 0.00084 red
6 Income_median 0.00571 0.00044 black
7 Religion 0.00514 0.00052 black
8 EnglishDiff 0.00469 0.00072 black
9 EnglishSpeak 0.00397 0.00050 black
10 Familiarity.with.Ethnic.Origin 0.00373 0.00056 black
11 Employment 0.00313 0.00031 black
12 Familiarity.with.America 0.00202 0.00048 black
13 Gender 0.00129 0.00047 black
14 Discrimination 0.00087 0.00035 black
15 Primary.Language 0.00081 0.00030 black
16 Belonging 0.00067 0.00050 black
17 US.Born 0.00021 0.00027 black
18 Identify.Ethnically -0.00028 0.00044 black
# ---- Variable-importance chart (physical check-up) ---------------------
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill = fill)) +
  geom_bar(stat = "identity", alpha = 0.4) +
  geom_errorbar(aes(ymin = Importance - sd_Importance, ymax = Importance + sd_Importance)) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_manual(values = c("black", "red"),
                    guide = "none")
plot(importance_plot)
# Explicit plot argument (the original relied on the last-displayed plot).
ggsave(filename = "FinalFigures/VSURF_importance_PC.png", plot = importance_plot,
       width = 6, height = 4.5, units = "in")
# Append Ethnicity only when VSURF did not already select it, avoiding a
# duplicated formula term.
pred_vars <- names(rfdata[, -1])[vsurf.mod$varselect.interp]
if (!"Ethnicity" %in% pred_vars) {
  pred_vars <- c(pred_vars, "Ethnicity")
}
mod_form <- reformulate(pred_vars, response = "Physical.Check.up")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in% c("Ethnicity")], response = "Physical.Check.up")
# NOTE(review): fit on the full rfdata (not training_set) as in the dental
# section — confirm intentional given the test-set evaluation below.
mod1 <- glm(mod_form, family = "binomial", data = rfdata)
mod2 <- glm(mod_form_noEth, family = "binomial", data = rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance
Model 2: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance +
Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1962 2198.1
2 1957 2155.8 5 42.35 5.004e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 2224.021 2228.452
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.974225 0.165135 -5.900 3.64e-09 ***
Duration.of.Residency 0.039534 0.005388 7.337 2.18e-13 ***
Age 0.015793 0.003846 4.107 4.01e-05 ***
Health.Insurance1 -0.791580 0.074487 -10.627 < 2e-16 ***
Ethnicity1 -0.055909 0.105985 -0.528 0.5978
Ethnicity2 0.185691 0.112516 1.650 0.0989 .
Ethnicity3 0.462318 0.161909 2.855 0.0043 **
Ethnicity4 -0.583407 0.110034 -5.302 1.14e-07 ***
Ethnicity5 -0.279414 0.189871 -1.472 0.1411
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2466.2 on 1965 degrees of freedom
Residual deviance: 2155.8 on 1957 degrees of freedom
AIC: 2173.8
Number of Fisher Scoring iterations: 4
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.563 0.0292 Inf 0.505 0.619
Asian Indian 0.621 0.0306 Inf 0.559 0.679
Filipino 0.683 0.0413 Inf 0.598 0.758
Korean 0.431 0.0303 Inf 0.373 0.491
Other 0.507 0.0564 Inf 0.398 0.615
Vietnamese 0.641 0.0319 Inf 0.576 0.700
Results are averaged over the levels of: Health.Insurance
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff") contrast estimate SE df z.ratio p.value
Chinese effect -0.0559 0.106 Inf -0.528 0.5978
Asian Indian effect 0.1857 0.113 Inf 1.650 0.1483
Filipino effect 0.4623 0.162 Inf 2.855 0.0129
Korean effect -0.5834 0.110 Inf -5.302 <.0001
Other effect -0.2794 0.190 Inf -1.472 0.1694
Vietnamese effect 0.2707 0.124 Inf 2.184 0.0579
Results are averaged over the levels of: Health.Insurance
Results are given on the log odds ratio (not the response) scale.
P value adjustment: fdr method for 6 tests
predictions <- mod1 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Physical.Check.up,predictions)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
ROC
Call:
roc.default(response = test_set$Physical.Check.up, predictor = predictions)
Data: predictions in 126 controls (test_set$Physical.Check.up 0) < 267 cases (test_set$Physical.Check.up Yes).
Area under the curve: 0.7751
ROC |> plot()threshold <- ROC |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold[1] 0.6782272
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Physical.Check.up,responsepredictions,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 94 32
Yes 81 186
Accuracy : 0.7125
95% CI : (0.665, 0.7567)
No Information Rate : 0.5547
P-Value [Acc > NIR] : 9.406e-11
Kappa : 0.4014
Mcnemar's Test P-Value : 6.318e-06
Sensitivity : 0.8532
Specificity : 0.5371
Pos Pred Value : 0.6966
Neg Pred Value : 0.7460
Prevalence : 0.5547
Detection Rate : 0.4733
Detection Prevalence : 0.6794
Balanced Accuracy : 0.6952
'Positive' Class : Yes
predictions <- mod2 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Physical.Check.up,predictions)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
ROC
Call:
roc.default(response = test_set$Physical.Check.up, predictor = predictions)
Data: predictions in 126 controls (test_set$Physical.Check.up 0) < 267 cases (test_set$Physical.Check.up Yes).
Area under the curve: 0.7543
ROC |> plot()threshold <- ROC |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold[1] 0.677257
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
confusionMatrix(test_set$Physical.Check.up,responsepredictions,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 94 32
Yes 83 184
Accuracy : 0.7074
95% CI : (0.6597, 0.7519)
No Information Rate : 0.5496
P-Value [Acc > NIR] : 1.033e-10
Kappa : 0.3932
Mcnemar's Test P-Value : 3.124e-06
Sensitivity : 0.8519
Specificity : 0.5311
Pos Pred Value : 0.6891
Neg Pred Value : 0.7460
Prevalence : 0.5496
Detection Rate : 0.4682
Detection Prevalence : 0.6794
Balanced Accuracy : 0.6915
'Positive' Class : Yes
unregister_dopar()
ctrl <- trainControl(method="cv")
model <- train(Physical.Check.up~., data = rfdata, method = "rf", trControl = ctrl)
summary(model) Length Class Mode
call 4 -none- call
type 1 -none- character
predicted 1966 factor numeric
err.rate 1500 -none- numeric
confusion 6 -none- numeric
votes 3932 matrix numeric
oob.times 1966 -none- numeric
classes 2 -none- character
importance 39 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 14 -none- list
y 1966 factor numeric
test 0 -none- NULL
inbag 0 -none- NULL
xNames 39 -none- character
problemType 1 -none- character
tuneValue 1 data.frame list
obsLevels 2 -none- character
param 0 -none- list
model |> print()Random Forest
1966 samples
18 predictor
2 classes: '0', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1769, 1770, 1769, 1769, 1770, 1769, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.7279188 0.2632947
20 0.7202424 0.3081309
39 0.7085543 0.2792593
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
# Clear any registered parallel backend, then refit the cross-validated random
# forest WITHOUT Ethnicity, for comparison with the full-predictor benchmark.
unregister_dopar()
ctrl <- trainControl(method="cv")
model <- train(Physical.Check.up~., data = rfdata|> dplyr::select(-Ethnicity), method = "rf", trControl = ctrl)
model |> print()Random Forest
1966 samples
17 predictor
2 classes: '0', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1770, 1769, 1770, 1769, 1769, 1770, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.7085621 0.2076427
18 0.7141485 0.2952116
34 0.7090671 0.2868507
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 18.
# Build the modelling frame for the dentist-check-up outcome: outcome column
# first, then demographic / acculturation / insurance predictors; drop rows
# with any NA, and make syntactic column names (spaces -> dots) so formula
# interfaces and randomForest work.
# NOTE(review): this pipeline mixes magrittr `%>%` (after select) with the
# native `|>` used everywhere else — harmless but inconsistent.
rfdata <- qol |>
select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Dentist.Check.up)| Characteristic | N = 1,9611 |
|---|---|
| Dentist.Check.up | |
| 0 | 811 (41%) |
| Yes | 1,150 (59%) |
| 1 n (%) | |
# Reproducible 80/20 split, stratified on the outcome by createDataPartition.
training_prop <- 0.8
set.seed(14)
train_index <- createDataPartition(y=rfdata$Dentist.Check.up,
p=training_prop,
list=F)
# Index into the full frame: rows in train_index form the training set,
# the remainder the held-out test set. (Style: prefer FALSE over F above.)
training_set <- rfdata[train_index,]
test_set <- rfdata[-train_index,]
training_set |> gtsummary::tbl_summary(include=Dentist.Check.up)| Characteristic | N = 1,5691 |
|---|---|
| Dentist.Check.up | |
| 0 | 649 (41%) |
| Yes | 920 (59%) |
| 1 n (%) | |
test_set |> gtsummary::tbl_summary(include=Dentist.Check.up)| Characteristic | N = 3921 |
|---|---|
| Dentist.Check.up | |
| 0 | 162 (41%) |
| Yes | 230 (59%) |
| 1 n (%) | |
# Size of the minority class ("0" = no dentist check-up, 41% per the table
# above) in the training set, used for balanced down-sampling in each tree.
n_minority <- training_set |> filter(Dentist.Check.up=="0") |> nrow()
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = training_set[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
# Three-step VSURF selection (thresholding -> interpretation -> prediction),
# run in parallel; result right-assigned into vsurf.mod.
# NOTE(review): the warning below says formula-call indices refer to the
# model-matrix column order — the name lookups later index rfdata[,-1] and
# should be checked against that ordering.
VSURF(Dentist.Check.up~.,
training_set,
na.action="na.omit",
parallel=T,verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Dentist.Check.up ~ ., training_set, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 8.3 secs
VSURF selected:
18 variables at thresholding step (in 4.6 secs)
3 variables at interpretation step (in 3 secs)
1 variables at prediction step (in 0.8 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Dental.Insurance"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Dental.Insurance" "Duration.of.Residency" "Religion"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.2981517
# Assemble VSURF's mean importances (and SDs) from the thresholding step,
# mapped back to predictor names (rfdata minus the outcome column); flag
# Ethnicity for red highlighting in the plot below.
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Dental.Insurance 0.05190 0.00096 black
2 Duration.of.Residency 0.04496 0.00083 black
3 Religion 0.01423 0.00078 black
4 Ethnicity 0.01252 0.00063 red
5 Age 0.00839 0.00080 black
6 EnglishSpeak 0.00601 0.00076 black
7 Income_median 0.00590 0.00051 black
8 Health.Insurance 0.00571 0.00055 black
9 Identify.Ethnically 0.00390 0.00045 black
10 Familiarity.with.America 0.00329 0.00044 black
11 Gender 0.00302 0.00043 black
12 Familiarity.with.Ethnic.Origin 0.00271 0.00050 black
13 Employment 0.00208 0.00039 black
14 EnglishDiff 0.00185 0.00024 black
15 Primary.Language 0.00160 0.00033 black
16 Belonging 0.00158 0.00045 black
17 US.Born 0.00098 0.00023 black
18 Discrimination 0.00088 0.00042 black
# Bar chart of variable importance, ordered by importance, with +/- 1 SD
# error bars; Ethnicity drawn in red via the manual fill scale (legend off).
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
# ggsave() with no plot argument saves the last displayed plot. pred_vars =
# VSURF interpretation-step variables plus Ethnicity (forced in for the test).
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_DC.png", width=6, height=4.5,units="in")pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Dentist.Check.up")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Dentist.Check.up")
# Logistic models with (mod1) and without (mod2) Ethnicity.
# NOTE(review): both are fit on the FULL rfdata, yet evaluated later on
# test_set, which is a subset of rfdata — the test-set metrics are therefore
# optimistic (data leakage). Consider fitting on training_set instead.
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency +
Religion
Model 2: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency +
Religion + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1952 2209.7
2 1947 2201.0 5 8.7418 0.1198
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 2307.098 2277.933
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.542307 0.115283 -4.704 2.55e-06 ***
Dental.Insurance1 -0.804673 0.054352 -14.805 < 2e-16 ***
Duration.of.Residency 0.045686 0.004692 9.738 < 2e-16 ***
Religion1 0.098471 0.158743 0.620 0.535
Religion2 0.344886 0.178252 1.935 0.053 .
Religion3 0.225075 0.166045 1.356 0.175
Religion4 -0.193586 0.240057 -0.806 0.420
Religion5 -0.152920 0.312546 -0.489 0.625
Religion6 -0.423634 0.345307 -1.227 0.220
Ethnicity1 0.272980 0.132916 2.054 0.040 *
Ethnicity2 -0.225308 0.241135 -0.934 0.350
Ethnicity3 0.197420 0.177079 1.115 0.265
Ethnicity4 -0.141224 0.137876 -1.024 0.306
Ethnicity5 -0.045418 0.202181 -0.225 0.822
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2659.6 on 1960 degrees of freedom
Residual deviance: 2201.0 on 1947 degrees of freedom
AIC: 2229
Number of Fisher Scoring iterations: 4
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.612 0.0388 Inf 0.534 0.685
Asian Indian 0.489 0.0555 Inf 0.383 0.597
Filipino 0.594 0.0520 Inf 0.489 0.690
Korean 0.510 0.0431 Inf 0.426 0.594
Other 0.534 0.0590 Inf 0.419 0.646
Vietnamese 0.531 0.0415 Inf 0.450 0.611
Results are averaged over the levels of: Dental.Insurance, Religion
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T) contrast odds.ratio SE df asymp.LCL asymp.UCL null z.ratio
Chinese effect 1.314 0.175 Inf 0.925 1.87 1 2.054
Asian Indian effect 0.798 0.192 Inf 0.423 1.51 1 -0.934
Filipino effect 1.218 0.216 Inf 0.764 1.94 1 1.115
Korean effect 0.868 0.120 Inf 0.604 1.25 1 -1.024
Other effect 0.956 0.193 Inf 0.561 1.63 1 -0.225
Vietnamese effect 0.943 0.130 Inf 0.656 1.36 1 -0.425
p.value
0.2400
0.5252
0.5252
0.5252
0.8223
0.8048
Results are averaged over the levels of: Dental.Insurance, Religion
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 6 estimates
Intervals are back-transformed from the log odds ratio scale
P value adjustment: fdr method for 6 tests
Tests are performed on the log odds ratio scale
# Test-set predicted probabilities from the Ethnicity model (mod1).
predictions <- mod1 |> predict(newdata=test_set,type="response")
# ROC on the held-out set; pROC auto-detects control ("0") vs case ("Yes").
ROC <- roc(test_set$Dentist.Check.up,predictions)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
ROC
Call:
roc.default(response = test_set$Dentist.Check.up, predictor = predictions)
Data: predictions in 162 controls (test_set$Dentist.Check.up 0) < 230 cases (test_set$Dentist.Check.up Yes).
Area under the curve: 0.7827
# Plot the ROC curve and take the Youden-optimal cutoff for classification.
ROC |> plot()threshold <- ROC |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold[1] 0.5187899
# Classify at the cutoff with the same "0"/"Yes" coding as the outcome.
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
# NOTE(review): confusionMatrix(data, reference) — truth is passed as `data`
# and predictions as `reference`, swapping Sensitivity/Specificity; verify.
confusionMatrix(test_set$Dentist.Check.up,responsepredictions,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 106 56
Yes 41 189
Accuracy : 0.7526
95% CI : (0.7067, 0.7945)
No Information Rate : 0.625
P-Value [Acc > NIR] : 5.29e-08
Kappa : 0.4827
Mcnemar's Test P-Value : 0.1552
Sensitivity : 0.7714
Specificity : 0.7211
Pos Pred Value : 0.8217
Neg Pred Value : 0.6543
Prevalence : 0.6250
Detection Rate : 0.4821
Detection Prevalence : 0.5867
Balanced Accuracy : 0.7463
'Positive' Class : Yes
# Same evaluation for the no-ethnicity model (mod2).
predictions <- mod2 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Dentist.Check.up,predictions)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
ROC
Call:
roc.default(response = test_set$Dentist.Check.up, predictor = predictions)
Data: predictions in 162 controls (test_set$Dentist.Check.up 0) < 230 cases (test_set$Dentist.Check.up Yes).
Area under the curve: 0.7767
# Youden-optimal cutoff for mod2, then classify and cross-tabulate.
ROC |> plot()threshold <- ROC |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold[1] 0.5253235
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
# NOTE(review): same argument-order concern as above — truth passed as `data`.
confusionMatrix(test_set$Dentist.Check.up,responsepredictions,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 105 57
Yes 40 190
Accuracy : 0.7526
95% CI : (0.7067, 0.7945)
No Information Rate : 0.6301
P-Value [Acc > NIR] : 1.581e-07
Kappa : 0.4817
Mcnemar's Test P-Value : 0.1043
Sensitivity : 0.7692
Specificity : 0.7241
Pos Pred Value : 0.8261
Neg Pred Value : 0.6481
Prevalence : 0.6301
Detection Rate : 0.4847
Detection Prevalence : 0.5867
Balanced Accuracy : 0.7467
'Positive' Class : Yes
# Clear any parallel backend, then 10-fold CV random-forest benchmark on all
# predictors (Ethnicity included) for the dentist outcome.
unregister_dopar()
ctrl <- trainControl(method="cv")
model <- train(Dentist.Check.up~., data = rfdata, method = "rf", trControl = ctrl)
summary(model) Length Class Mode
call 4 -none- call
type 1 -none- character
predicted 1961 factor numeric
err.rate 1500 -none- numeric
confusion 6 -none- numeric
votes 3922 matrix numeric
oob.times 1961 -none- numeric
classes 2 -none- character
importance 39 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 14 -none- list
y 1961 factor numeric
test 0 -none- NULL
inbag 0 -none- NULL
xNames 39 -none- character
problemType 1 -none- character
tuneValue 1 data.frame list
obsLevels 2 -none- character
param 0 -none- list
model |> print()Random Forest
1961 samples
18 predictor
2 classes: '0', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1765, 1765, 1765, 1765, 1765, 1764, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.7241091 0.4182409
20 0.7037165 0.3884665
39 0.7027064 0.3843739
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
# Companion benchmark without Ethnicity, to gauge its marginal contribution.
unregister_dopar()
ctrl <- trainControl(method="cv")
model <- train(Dentist.Check.up~., data = rfdata|> dplyr::select(-Ethnicity), method = "rf", trControl = ctrl)
model |> print()Random Forest
1961 samples
17 predictor
2 classes: '0', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1765, 1765, 1765, 1765, 1765, 1764, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.7210246 0.4118714
18 0.7006319 0.3817943
34 0.6934813 0.3665140
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
# Modelling frame for the folk-medicine outcome: same predictor set as the
# dentist analysis, complete cases only, syntactic names for formulas.
# NOTE(review): unlike the dentist frame, this one does not rename
# `Full Time Employment` / `English Speaking` / `English Difficulties`, so
# downstream names differ between sections (e.g. English.Speaking here vs
# EnglishSpeak earlier) — intentional? consider aligning.
rfdata <- qol |> select(`Folkmedicine`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Folkmedicine)| Characteristic | N = 1,9471 |
|---|---|
| Folkmedicine | |
| 0 | 1,681 (86%) |
| Yes | 266 (14%) |
| 1 n (%) | |
# Reproducible 80/20 outcome-stratified split, same seed as earlier sections.
training_prop <- 0.8
set.seed(14)
train_index <- createDataPartition(y=rfdata$Folkmedicine,
p=training_prop,
list=F)
# Rows in train_index -> training set; remainder -> held-out test set.
training_set <- rfdata[train_index,]
test_set <- rfdata[-train_index,]
training_set |> gtsummary::tbl_summary(include=Folkmedicine)| Characteristic | N = 1,5581 |
|---|---|
| Folkmedicine | |
| 0 | 1,345 (86%) |
| Yes | 213 (14%) |
| 1 n (%) | |
test_set |> gtsummary::tbl_summary(include=Folkmedicine)| Characteristic | N = 3891 |
|---|---|
| Folkmedicine | |
| 0 | 336 (86%) |
| Yes | 53 (14%) |
| 1 n (%) | |
# Minority class here is "Yes" (14% per the table above) — note this differs
# from the dentist section, where "0" was the minority level.
n_minority <- training_set |> filter(Folkmedicine=="Yes") |> nrow()
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = training_set[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
# VSURF three-step variable selection for folk medicine, balanced forests,
# run in parallel; result right-assigned into vsurf.mod.
# NOTE(review): formula-call indices refer to the model-matrix ordering (see
# warning) — name lookups against rfdata[,-1] below should be double-checked.
VSURF(Folkmedicine~.,
training_set,
na.action="na.omit",
parallel=T,
verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Folkmedicine ~ ., training_set, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 8.1 secs
VSURF selected:
18 variables at thresholding step (in 3.9 secs)
3 variables at interpretation step (in 2.7 secs)
2 variables at prediction step (in 1.5 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Age" "Ethnicity"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Age" "Duration.of.Residency" "Ethnicity"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.1379974
# VSURF mean importances (with SDs) mapped to predictor names; Ethnicity
# flagged red for the importance plot.
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Age 0.01810 0.00092 black
2 Duration.of.Residency 0.01312 0.00086 black
3 Ethnicity 0.00808 0.00050 red
4 English.Speaking 0.00702 0.00055 black
5 Income_median 0.00463 0.00036 black
6 Religion 0.00401 0.00081 black
7 English.Difficulties 0.00256 0.00058 black
8 Full.Time.Employment 0.00239 0.00046 black
9 Familiarity.with.America 0.00198 0.00043 black
10 Familiarity.with.Ethnic.Origin 0.00180 0.00040 black
11 Primary.Language 0.00134 0.00031 black
12 Discrimination 0.00117 0.00038 black
13 Health.Insurance 0.00089 0.00023 black
14 Dental.Insurance 0.00076 0.00019 black
15 US.Born 0.00069 0.00016 black
16 Gender 0.00044 0.00036 black
17 Belonging 0.00033 0.00033 black
18 Identify.Ethnically 0.00027 0.00035 black
# Importance bar chart (ordered, +/- 1 SD error bars, Ethnicity in red).
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
# Here Ethnicity is already in varselect.interp, so it is not appended
# manually as in the dentist section.
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_AltMedicine.png", width=6, height=4.5,units="in")pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp])
mod_form <- reformulate(pred_vars, response="Folkmedicine")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Folkmedicine")
# Logistic models with/without Ethnicity.
# NOTE(review): fit on the FULL rfdata but later scored on test_set, which is
# a subset of rfdata — test metrics are optimistic (data leakage); consider
# fitting on training_set.
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Folkmedicine ~ Age + Duration.of.Residency
Model 2: Folkmedicine ~ Age + Duration.of.Residency + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1944 1507.7
2 1939 1450.9 5 56.75 5.693e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1511.497 1530.377
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.135528 0.204648 -15.322 < 2e-16 ***
Age 0.022611 0.004636 4.877 1.08e-06 ***
Duration.of.Residency 0.007417 0.005960 1.244 0.2134
Ethnicity1 0.522926 0.131708 3.970 7.18e-05 ***
Ethnicity2 -0.286686 0.173196 -1.655 0.0979 .
Ethnicity3 -0.219541 0.217195 -1.011 0.3121
Ethnicity4 0.757469 0.133299 5.682 1.33e-08 ***
Ethnicity5 -0.212500 0.287863 -0.738 0.4604
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1552.9 on 1946 degrees of freedom
Residual deviance: 1450.9 on 1939 degrees of freedom
AIC: 1466.9
Number of Fisher Scoring iterations: 5
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.1741 0.0176 Inf 0.1422 0.2113
Asian Indian 0.0857 0.0142 Inf 0.0617 0.1180
Filipino 0.0912 0.0203 Inf 0.0585 0.1394
Korean 0.2104 0.0208 Inf 0.1725 0.2541
Other 0.0917 0.0280 Inf 0.0496 0.1634
Vietnamese 0.0665 0.0125 Inf 0.0459 0.0955
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T) contrast odds.ratio SE df asymp.LCL asymp.UCL null z.ratio
Chinese effect 1.687 0.222 Inf 1.192 2.388 1 3.970
Asian Indian effect 0.751 0.130 Inf 0.475 1.186 1 -1.655
Filipino effect 0.803 0.174 Inf 0.453 1.424 1 -1.011
Korean effect 2.133 0.284 Inf 1.500 3.032 1 5.682
Other effect 0.809 0.233 Inf 0.378 1.728 1 -0.738
Vietnamese effect 0.570 0.105 Inf 0.351 0.927 1 -3.053
p.value
0.0002
0.1468
0.3745
<.0001
0.4604
0.0045
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 6 estimates
Intervals are back-transformed from the log odds ratio scale
P value adjustment: fdr method for 6 tests
Tests are performed on the log odds ratio scale
# Test-set predicted probabilities from the Ethnicity model, then ROC.
predictions <- mod1 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Folkmedicine,predictions)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
ROC
Call:
roc.default(response = test_set$Folkmedicine, predictor = predictions)
Data: predictions in 336 controls (test_set$Folkmedicine 0) < 53 cases (test_set$Folkmedicine Yes).
Area under the curve: 0.6865
# Youden-optimal cutoff for the folk-medicine model, then classify.
ROC |> plot()threshold <- ROC |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold[1] 0.1925455
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
# NOTE(review): confusionMatrix(data, reference) — truth passed as `data`,
# predictions as `reference`; Sensitivity/Specificity roles are swapped; verify.
confusionMatrix(test_set$Folkmedicine,responsepredictions,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 273 63
Yes 26 27
Accuracy : 0.7712
95% CI : (0.7262, 0.812)
No Information Rate : 0.7686
P-Value [Acc > NIR] : 0.4803199
Kappa : 0.2488
Mcnemar's Test P-Value : 0.0001356
Sensitivity : 0.30000
Specificity : 0.91304
Pos Pred Value : 0.50943
Neg Pred Value : 0.81250
Prevalence : 0.23136
Detection Rate : 0.06941
Detection Prevalence : 0.13625
Balanced Accuracy : 0.60652
'Positive' Class : Yes
# Same evaluation for the no-ethnicity model (mod2).
predictions <- mod2 |> predict(newdata=test_set,type="response")
ROC <- roc(test_set$Folkmedicine,predictions)Setting levels: control = 0, case = Yes
Setting direction: controls < cases
ROC
Call:
roc.default(response = test_set$Folkmedicine, predictor = predictions)
Data: predictions in 336 controls (test_set$Folkmedicine 0) < 53 cases (test_set$Folkmedicine Yes).
Area under the curve: 0.5989
# Youden cutoff and classification for mod2.
ROC |> plot()threshold <- ROC |> coords("best", ret = "threshold", best.method = "youden") |> as.numeric()
threshold[1] 0.1389611
responsepredictions <- ifelse(predictions > threshold, "Yes","0") |> factor(levels=c("0","Yes"))
# NOTE(review): same confusionMatrix argument-order concern as above.
confusionMatrix(test_set$Folkmedicine,responsepredictions,positive="Yes")Confusion Matrix and Statistics
Reference
Prediction 0 Yes
0 224 112
Yes 24 29
Accuracy : 0.6504
95% CI : (0.6007, 0.6978)
No Information Rate : 0.6375
P-Value [Acc > NIR] : 0.3189
Kappa : 0.1258
Mcnemar's Test P-Value : 8.64e-14
Sensitivity : 0.20567
Specificity : 0.90323
Pos Pred Value : 0.54717
Neg Pred Value : 0.66667
Prevalence : 0.36247
Detection Rate : 0.07455
Detection Prevalence : 0.13625
Balanced Accuracy : 0.55445
'Positive' Class : Yes
# Clear parallel backend, then 10-fold CV random-forest benchmark on all
# predictors for the folk-medicine outcome. (Kappa = 0 at mtry = 2 in the
# output below indicates the forest predicted the majority class only.)
unregister_dopar()
ctrl <- trainControl(method="cv")
model <- train(Folkmedicine~., data = rfdata, method = "rf", trControl = ctrl)
summary(model) Length Class Mode
call 4 -none- call
type 1 -none- character
predicted 1947 factor numeric
err.rate 1500 -none- numeric
confusion 6 -none- numeric
votes 3894 matrix numeric
oob.times 1947 -none- numeric
classes 2 -none- character
importance 39 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 14 -none- list
y 1947 factor numeric
test 0 -none- NULL
inbag 0 -none- NULL
xNames 39 -none- character
problemType 1 -none- character
tuneValue 1 data.frame list
obsLevels 2 -none- character
param 0 -none- list
model |> print()Random Forest
1947 samples
18 predictor
2 classes: '0', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1753, 1752, 1752, 1753, 1753, 1752, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.8633836 0.00000000
20 0.8592757 0.04589079
39 0.8582474 0.05946169
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
# Companion CV benchmark without Ethnicity for the folk-medicine outcome.
unregister_dopar()
ctrl <- trainControl(method="cv")
model <- train(Folkmedicine~., data = rfdata|> dplyr::select(-Ethnicity), method = "rf", trControl = ctrl)
model |> print()Random Forest
1947 samples
17 predictor
2 classes: '0', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 1753, 1753, 1752, 1752, 1753, 1752, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.8633836 0.00000000
18 0.8633862 0.07901580
34 0.8602987 0.05914409
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 18.